Combining Simulation Results

library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.0.5
#read in all files
setwd("~/Desktop/Cluster/OOSEst/8Nov2023")
files <- list.files(pattern=".Rdata")

all_res <- target_res <- c()
for (i in 1:length(files)) {
  load(file = files[i])
  
  #all metrics
  all_res_ind <- results$all_res
  all_res <- bind_rows(all_res, all_res_ind)
  
  #target metrics
  target_res_ind <- results$target_res %>%
    mutate(K = unique(all_res_ind$K),
           n_mean = unique(all_res_ind$n_mean),
           n_sd = unique(all_res_ind$n_sd),
           covars_fix = unique(all_res_ind$covars_fix),
           covars_rand = unique(all_res_ind$covars_rand),
           lin = unique(all_res_ind$lin),
           eps_study_m = unique(all_res_ind$eps_study_m),
           eps_study_tau = unique(all_res_ind$eps_study_tau),
           eps_study_inter = unique(all_res_ind$eps_study_inter),
           distribution = unique(all_res_ind$distribution))
  target_res <- bind_rows(target_res, target_res_ind)
  
  print(i)
}

#checking output
seeds <- strsplit(files,"_") %>% map_chr(.f=function(x) x[2])
length(unique(seeds)) == length(files) #making sure every iteration has a unique seed

nrow(all_res) #to make sure every iteration ran
nrow(target_res)/800
setwd("~/Dropbox/Moderation/Out of Sample Prediction/Data")
saveRDS(all_res, "all_res_8Nov2023.RDS")
saveRDS(target_res, "target_res_8Nov2023.RDS")
all_res <- readRDS("Data/all_res_8Nov2023.RDS")
target_res <- readRDS("Data/target_res_8Nov2023.RDS")

#summarise across iterations
target_ind <- target_res %>%
  group_by(id, W, sex, smstat, age, age2, weight, madrs, Y, tau, method,
           K, n_mean, n_sd, covars_fix, covars_rand, lin, eps_study_m,
           eps_study_tau, eps_study_inter, distribution) %>%
  summarise(n_iter = n(),
            coverage = mean(coverage),
            bias = mean(bias),
            abs_bias = mean(abs_bias),
            length = mean(length),
            significant = mean(significant)) %>%
  mutate(scenario = paste(covars_fix, covars_rand, lin, eps_study_m,
                          eps_study_tau, eps_study_inter, sep = "_"),
         sds = paste(eps_study_m, eps_study_tau, eps_study_inter, sep = "_"),
         Linear = ifelse(lin == T, "Linear", "Non-Linear"))

#average across target sample
target_sum <- target_ind %>%
  group_by(method,
           K, n_mean, n_sd, covars_fix, covars_rand, lin, eps_study_m,
           eps_study_tau, eps_study_inter, distribution) %>%
  summarise(across(coverage:significant, list(mean = mean, sd = sd))) %>%
  mutate(scenario = paste(covars_fix, covars_rand, lin, eps_study_m,
                          eps_study_tau, eps_study_inter, sep = "_"),
         sds = paste(eps_study_m, eps_study_tau, eps_study_inter, sep = "_"),
         Linear = ifelse(lin == T, "Linear", "Non-Linear"))

#long form for boxplot
all_long <- all_res %>%
  rename(MA = manual_res, MA_Inc = manual_res_wrong,
         HCF = cf_res, ACF = cf_a_res,
         HCF_Rand = cf_res_rand, ACF_Rand = cf_a_res_rand,
         SBART = sb_res, SBART_Rand = sb_res_rand) %>%
  pivot_longer(cols = MA:SBART_Rand,
               names_to = "Method",
               values_to = "Value") %>%
  mutate(scenario = paste(covars_fix, covars_rand, lin, eps_study_m,
                          eps_study_tau, eps_study_inter, sep = "_"),
         sds = paste(eps_study_m, eps_study_tau, eps_study_inter, sep = "_"),
         Dataset = factor(ifelse(grepl("target", Metric) == T,
                                 "Target", "Training"),
                          levels = c("Training", "Target")),
         Linear = ifelse(lin == T, "Linear", "Non-Linear"))

#average across iterations
all_sum <- all_res %>%
  rename(MA = manual_res, MA_Inc = manual_res_wrong,
         HCF = cf_res, ACF = cf_a_res,
         HCF_Rand = cf_res_rand, ACF_Rand = cf_a_res_rand,
         SBART = sb_res, SBART_Rand = sb_res_rand) %>%
  group_by(Metric, K, n_mean, n_sd, covars_fix, covars_rand,
           lin, eps_study_m, eps_study_tau, eps_study_inter,
           distribution) %>%
  summarise(across(MA:SBART_Rand, list(mean = mean, sd = sd)),
            n_iter = n())
summary(all_sum$n_iter) #check iterations
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     100     100     100     100     100     100
#average in long form
all_avg_long <- all_res %>%
  rename(MA = manual_res, MA_Inc = manual_res_wrong,
         HCF = cf_res, ACF = cf_a_res,
         HCF_Rand = cf_res_rand, ACF_Rand = cf_a_res_rand,
         SBART = sb_res, SBART_Rand = sb_res_rand) %>%
  group_by(Metric, K, n_mean, n_sd, covars_fix, covars_rand,
           lin, eps_study_m, eps_study_tau, eps_study_inter,
           distribution) %>%
  summarise(across(MA:SBART_Rand, mean)) %>%
  pivot_longer(cols = MA:SBART_Rand,
               names_to = "Method",
               values_to = "Value") %>%
  mutate(scenario = paste(covars_fix, covars_rand, lin, eps_study_m,
                          eps_study_tau, eps_study_inter, sep = "_"),
         sds = paste(eps_study_m, eps_study_tau, eps_study_inter, sep = "_"))

TARGET EVAL: Age Moderator, Same Training Distribution

Coverage

#boxplot across 100 people in target sample
target_ind %>%
  filter(covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=method, group=method)) +
  geom_boxplot(aes(y=coverage, color=method)) +
  geom_hline(aes(yintercept = 0.95), linetype = "dashed") +
  facet_grid(Linear ~ sds, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ylab("Coverage in Target Sample")

#average across all iterations
target_sum %>%
  filter(covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=method, group=method)) +
  geom_point(aes(y=coverage_mean, color=method)) +
  # geom_errorbar(aes(ymin = coverage_mean - 1.96*coverage_sd,
  #                   ymax = min(coverage_mean + 1.96*coverage_sd, 1),
  #                   color=method)) +
  geom_hline(aes(yintercept = 0.95), linetype = "dashed") +
  facet_grid(Linear ~ sds) +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ylab("Coverage in Target Sample")

Length

#boxplot across 100 people in target sample
target_ind %>%
  filter(covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=method, group=method)) +
  geom_boxplot(aes(y=length, color=method)) +
  facet_grid(Linear ~ sds, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ylab("CI Length in Target Sample")

#average across all iterations
target_sum %>%
  filter(covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=method, group=method)) +
  geom_point(aes(y=length_mean, color=method)) +
  # geom_errorbar(aes(ymin = coverage_mean - 1.96*coverage_sd,
  #                   ymax = min(coverage_mean + 1.96*coverage_sd, 1),
  #                   color=method)) +
  facet_grid(Linear ~ sds) +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ylab("CI Length in Target Sample")

Bias

target_ind %>%
  filter(covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=method, group=method)) +
  geom_boxplot(aes(y=abs_bias, color=method)) +
  facet_grid(Linear ~ sds, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("Absolute Bias in Target Sample")

TARGET EVAL: Other Settings

Age Moderator, Varying MADRS in Training Distributions

#boxplot across 100 people in target sample
target_ind %>%
  filter(covars_fix == "age",
         distribution == "varying_madrs") %>%
  ggplot(aes(x=method, group=method)) +
  geom_boxplot(aes(y=coverage, color=method)) +
  geom_hline(aes(yintercept = 0.95), linetype = "dashed") +
  facet_grid(Linear ~ sds, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ylab("Coverage in Target Sample")

Age Moderator, Varying Age in Training Distributions

#boxplot across 100 people in target sample
target_ind %>%
  filter(covars_fix == "age",
         distribution == "separate_age") %>%
  ggplot(aes(x=method, group=method)) +
  geom_boxplot(aes(y=coverage, color=method)) +
  geom_hline(aes(yintercept = 0.95), linetype = "dashed") +
  facet_grid(Linear ~ sds, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ylab("Coverage in Target Sample")

Age and MADRS Moderator, Same Training Distributions

#boxplot across 100 people in target sample
target_ind %>%
  filter(covars_fix == "age, madrs",
         distribution == "same") %>%
  ggplot(aes(x=method, group=method)) +
  geom_boxplot(aes(y=coverage, color=method)) +
  geom_hline(aes(yintercept = 0.95), linetype = "dashed") +
  facet_grid(Linear ~ sds, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ylab("Coverage in Target Sample")

Age and MADRS Moderator, Varying MADRS in Training Distributions

#boxplot across 100 people in target sample
target_ind %>%
  filter(covars_fix == "age, madrs",
         distribution == "varying_madrs") %>%
  ggplot(aes(x=method, group=method)) +
  geom_boxplot(aes(y=coverage, color=method)) +
  geom_hline(aes(yintercept = 0.95), linetype = "dashed") +
  facet_grid(Linear ~ sds, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ylab("Coverage in Target Sample")

Age and MADRS Moderator, Separate Age in Training Distributions

#boxplot across 100 people in target sample
target_ind %>%
  filter(covars_fix == "age, madrs",
         distribution == "separate_age") %>%
  ggplot(aes(x=method, group=method)) +
  geom_boxplot(aes(y=coverage, color=method)) +
  geom_hline(aes(yintercept = 0.95), linetype = "dashed") +
  facet_grid(Linear ~ sds, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  ylab("Coverage in Target Sample")

Results by Method

For now, just using the setup where age is the fixed and random moderator, the training distributions are the same across trials, and the target distribution is the same as the training.

Meta-Analysis

#age, same target distribution
#mse
all_long %>%
  filter(Method %in% c("MA","MA_Inc"), covars_fix == "age",
         distribution == "same",
         grepl("mse", Metric) == T,
         sds != "1_1_0.5") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  facet_grid(Dataset~Linear, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("MSE")

#ggsave("Results/19Oct2023/MA_MSE_19Oct2023.jpeg")

#coverage
all_long %>%
  filter(Method %in% c("MA","MA_Inc"), covars_fix == "age",
         distribution == "same",
         grepl("coverage", Metric) == T,
         sds != "1_1_0.5") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  geom_hline(aes(yintercept=.95), linetype = "dashed") +
  facet_grid(Dataset~Linear, scales = "free") +
  scale_y_continuous(labels = scales::percent) +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Coverage Percent") 

#ggsave("Results/19Oct2023/MA_Cov_19Oct2023.jpeg")

Causal Forest

#age, same target distribution
#mse
all_long %>%
  filter(Method %in% c("HCF","ACF","HCF_Rand","ACF_Rand"), covars_fix == "age",
         distribution == "same",
         grepl("mse", Metric) == T,
         sds != "1_1_0.5") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  facet_grid(Dataset~Linear, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("MSE")

#ggsave("Results/19Oct2023/CF_MSE_19Oct2023.jpeg")

#coverage
all_long %>%
  filter(Method %in% c("HCF","ACF","HCF_Rand","ACF_Rand"), covars_fix == "age",
         distribution == "same",
         grepl("coverage", Metric) == T,
         sds != "1_1_0.5") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  geom_hline(aes(yintercept=.95), linetype = "dashed") +
  facet_grid(Dataset~Linear, scales = "free") +
  scale_y_continuous(labels = scales::percent) +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Coverage Percent") 

#ggsave("Results/19Oct2023/CF_Cov_19Oct2023.jpeg")

BART

#age, same target distribution
#mse
all_long %>%
  filter(Method %in% c("SBART","SBART_Rand"), covars_fix == "age",
         distribution == "same",
         grepl("mse", Metric) == T,
         sds != "1_1_0.5") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  facet_grid(Dataset~Linear, scales = "free") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("MSE")

#ggsave("Results/19Oct2023/BART_MSE_19Oct2023.jpeg")

#coverage
all_long %>%
  filter(Method %in% c("SBART","SBART_Rand"), covars_fix == "age",
         distribution == "same",
         grepl("coverage", Metric) == T,
         sds != "1_1_0.5") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  geom_hline(aes(yintercept=.95), linetype = "dashed") +
  facet_grid(Dataset~Linear, scales = "free") +
  scale_y_continuous(labels = scales::percent) +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Coverage Percent") 

#ggsave("Results/19Oct2023/BART_Cov_19Oct2023.jpeg")

Old

Target CI Coverage

#age, same target distribution
all_long %>%
  filter(Metric == "target_coverage", covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  geom_hline(aes(yintercept=.95), linetype = "dashed") +
  facet_wrap(~lin) +
  scale_y_continuous(labels = scales::percent, limits = c(.75, 1)) +
  labs(color="Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Coverage Percent in Target Sample") +
  ggtitle("Same Target Distribution")
## Warning: Removed 301 rows containing non-finite values (stat_boxplot).

# points version
all_avg_long %>%
  filter(Metric == "target_coverage", covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_point(aes(color = Method)) +
  geom_hline(aes(yintercept=.95), linetype = "dashed") +
  facet_wrap(~lin) +
  scale_y_continuous(labels = scales::percent, limits = c(.75, 1)) +
  labs(color="Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Coverage Percent in Target Sample") +
  ggtitle("Same Target Distribution")
## Warning: Removed 4 rows containing missing values (geom_point).

Training CI Coverage

#age, same target distribution
all_long %>%
  filter(Metric == "train_coverage", covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  geom_hline(aes(yintercept=.95), linetype = "dashed") +
  facet_wrap(~lin) +
  scale_y_continuous(labels = scales::percent, limits = c(.75, 1)) +
  labs(color="Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Coverage Percent in Training Sample") +
  ggtitle("Same Target Distribution")
## Warning: Removed 2305 rows containing non-finite values (stat_boxplot).

Target Bias

#age, same target distribution
all_long %>%
  filter(Metric == "target_bias", covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  facet_wrap(~lin) +
  labs(color="Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("Mean Absolute Bias in Target Sample") +
  ggtitle("Same Target Distribution")

Training Bias

#age, same target distribution
all_long %>%
  filter(Metric == "train_bias", covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  facet_wrap(~lin) +
  labs(color="Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("Mean Absolute Bias in Training Sample") +
  ggtitle("Same Target Distribution")

Target CI Length

#age, same target distribution
all_long %>%
  filter(Metric == "target_length", covars_fix == "age",
         distribution == "same") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  facet_wrap(~lin) + ylim(0,10) +
  labs(color="Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Length in Target Sample") +
  ggtitle("Same Target Distribution")
## Warning: Removed 123 rows containing non-finite values (stat_boxplot).

Target CI Coverage

#age, same target distribution
all_long %>%
  filter(Metric == "target_coverage", covars_fix == "age") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  geom_hline(aes(yintercept=.95), linetype = "dashed") +
  facet_grid(lin~distribution) +
  scale_y_continuous(labels = scales::percent, limits = c(.75, 1)) +
  labs(color="Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Coverage Percent in Target Sample") +
  ggtitle("Same Target Distribution")
## Warning: Removed 654 rows containing non-finite values (stat_boxplot).

all_long %>%
  filter(Metric == "target_coverage", covars_fix == "age, madrs") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  geom_hline(aes(yintercept=.95), linetype = "dashed") +
  facet_grid(lin~distribution) +
  scale_y_continuous(labels = scales::percent, limits = c(.75, 1)) +
  labs(color="Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Coverage Percent in Target Sample") +
  ggtitle("Same Target Distribution, Age and MADRS")
## Warning: Removed 112 rows containing non-finite values (stat_boxplot).

Training CI Coverage

#age, same target distribution
all_long %>%
  filter(Metric == "train_coverage", covars_fix == "age") %>%
  ggplot(aes(x=sds, y=Value)) +
  geom_boxplot(aes(color = Method)) +
  geom_hline(aes(yintercept=.95), linetype = "dashed") +
  facet_grid(lin~distribution) +
  scale_y_continuous(labels = scales::percent, limits = c(.75, 1)) +
  labs(color="Method") +
  theme(axis.text.x = element_text(angle=90),
        text = element_text(size=12)) +
  xlab("Variability of Main, Trt, and Trt*Age Coefficients") +
  ylab("CI Coverage Percent in Training Sample") +
  ggtitle("Same Target Distribution")
## Warning: Removed 6693 rows containing non-finite values (stat_boxplot).